Unsupervised Learning - Project

by Sachin Sharma

Import Libraries

In [1]:
import numpy as np   
from sklearn.linear_model import LinearRegression
import pandas as pd    

import matplotlib.pyplot as plt 
%matplotlib inline 

import seaborn as sns
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression 
from sklearn.decomposition import PCA

from sklearn import metrics

from sklearn.naive_bayes import GaussianNB

from sklearn.svm import SVC

from scipy.stats import zscore

Load Table, Data pre-processing

In [2]:
cData = pd.read_csv("Vehicle.csv")  
cData.shape
Out[2]:
(846, 19)
In [3]:
cData.head()
Out[3]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio class
0 95 48.0 83.0 178.0 72.0 10 162.0 42.0 20.0 159 176.0 379.0 184.0 70.0 6.0 16.0 187.0 197 van
1 91 41.0 84.0 141.0 57.0 9 149.0 45.0 19.0 143 170.0 330.0 158.0 72.0 9.0 14.0 189.0 199 van
2 104 50.0 106.0 209.0 66.0 10 207.0 32.0 23.0 158 223.0 635.0 220.0 73.0 14.0 9.0 188.0 196 car
3 93 41.0 82.0 159.0 63.0 9 144.0 46.0 19.0 143 160.0 309.0 127.0 63.0 6.0 10.0 199.0 207 van
4 85 44.0 70.0 205.0 103.0 52 149.0 45.0 19.0 144 241.0 325.0 188.0 127.0 9.0 11.0 180.0 183 bus
In [4]:
cData.describe().transpose()
Out[4]:
count mean std min 25% 50% 75% max
compactness 846.0 93.678487 8.234474 73.0 87.00 93.0 100.0 119.0
circularity 841.0 44.828775 6.152172 33.0 40.00 44.0 49.0 59.0
distance_circularity 842.0 82.110451 15.778292 40.0 70.00 80.0 98.0 112.0
radius_ratio 840.0 168.888095 33.520198 104.0 141.00 167.0 195.0 333.0
pr.axis_aspect_ratio 844.0 61.678910 7.891463 47.0 57.00 61.0 65.0 138.0
max.length_aspect_ratio 846.0 8.567376 4.601217 2.0 7.00 8.0 10.0 55.0
scatter_ratio 845.0 168.901775 33.214848 112.0 147.00 157.0 198.0 265.0
elongatedness 845.0 40.933728 7.816186 26.0 33.00 43.0 46.0 61.0
pr.axis_rectangularity 843.0 20.582444 2.592933 17.0 19.00 20.0 23.0 29.0
max.length_rectangularity 846.0 147.998818 14.515652 118.0 137.00 146.0 159.0 188.0
scaled_variance 843.0 188.631079 31.411004 130.0 167.00 179.0 217.0 320.0
scaled_variance.1 844.0 439.494076 176.666903 184.0 318.00 363.5 587.0 1018.0
scaled_radius_of_gyration 844.0 174.709716 32.584808 109.0 149.00 173.5 198.0 268.0
scaled_radius_of_gyration.1 842.0 72.447743 7.486190 59.0 67.00 71.5 75.0 135.0
skewness_about 840.0 6.364286 4.920649 0.0 2.00 6.0 9.0 22.0
skewness_about.1 845.0 12.602367 8.936081 0.0 5.00 11.0 19.0 41.0
skewness_about.2 845.0 188.919527 6.155809 176.0 184.00 188.0 193.0 206.0
hollows_ratio 846.0 195.632388 7.438797 181.0 190.25 197.0 201.0 211.0
In [5]:
cData.dtypes
Out[5]:
compactness                      int64
circularity                    float64
distance_circularity           float64
radius_ratio                   float64
pr.axis_aspect_ratio           float64
max.length_aspect_ratio          int64
scatter_ratio                  float64
elongatedness                  float64
pr.axis_rectangularity         float64
max.length_rectangularity        int64
scaled_variance                float64
scaled_variance.1              float64
scaled_radius_of_gyration      float64
scaled_radius_of_gyration.1    float64
skewness_about                 float64
skewness_about.1               float64
skewness_about.2               float64
hollows_ratio                    int64
class                           object
dtype: object
In [6]:
cData.isnull().values.any()
Out[6]:
True
In [7]:
cData.isnull().sum()
Out[7]:
compactness                    0
circularity                    5
distance_circularity           4
radius_ratio                   6
pr.axis_aspect_ratio           2
max.length_aspect_ratio        0
scatter_ratio                  1
elongatedness                  1
pr.axis_rectangularity         3
max.length_rectangularity      0
scaled_variance                3
scaled_variance.1              2
scaled_radius_of_gyration      2
scaled_radius_of_gyration.1    4
skewness_about                 6
skewness_about.1               1
skewness_about.2               1
hollows_ratio                  0
class                          0
dtype: int64
In [8]:
#Replace mising values
cData = cData.replace(' ', np.nan)

for i in cData.columns[:17]:
    m = cData[i].median()
    cData[i] = cData[i].fillna(m)
In [9]:
cData.isnull().values.any()
Out[9]:
False

Since variable 'Class' is category type:

In [10]:
cData['class']=cData['class'].astype('category')

cData.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 846 entries, 0 to 845
Data columns (total 19 columns):
compactness                    846 non-null int64
circularity                    846 non-null float64
distance_circularity           846 non-null float64
radius_ratio                   846 non-null float64
pr.axis_aspect_ratio           846 non-null float64
max.length_aspect_ratio        846 non-null int64
scatter_ratio                  846 non-null float64
elongatedness                  846 non-null float64
pr.axis_rectangularity         846 non-null float64
max.length_rectangularity      846 non-null int64
scaled_variance                846 non-null float64
scaled_variance.1              846 non-null float64
scaled_radius_of_gyration      846 non-null float64
scaled_radius_of_gyration.1    846 non-null float64
skewness_about                 846 non-null float64
skewness_about.1               846 non-null float64
skewness_about.2               846 non-null float64
hollows_ratio                  846 non-null int64
class                          846 non-null category
dtypes: category(1), float64(14), int64(4)
memory usage: 120.0 KB

Understanding Attributes

In [11]:
cData['class'].value_counts()
Out[11]:
car    429
bus    218
van    199
Name: class, dtype: int64
In [12]:
sns.countplot(cData['class'])
Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a21deae688>
In [13]:
cData.hist(figsize=(18,18))
Out[13]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x000001A21D7E28C8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001A2200A7D08>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001A2200E1148>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001A220119288>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001A220152388>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001A22018C488>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001A2201C9FC8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001A220204E88>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001A220209288>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001A220240448>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001A2202A49C8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001A2202DFA88>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001A220318BC8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001A220350D08>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001A22038ADC8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001A2203C4EC8>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x000001A220405148>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001A220431FC8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001A22046F108>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x000001A2204A7308>]],
      dtype=object)
In [14]:
#Outliers
g = cData.boxplot(figsize=(15,15))
g.set_xticklabels(plt.xticks()[1], rotation=60)
Out[14]:
[Text(0, 0, 'compactness'),
 Text(0, 0, 'circularity'),
 Text(0, 0, 'distance_circularity'),
 Text(0, 0, 'radius_ratio'),
 Text(0, 0, 'pr.axis_aspect_ratio'),
 Text(0, 0, 'max.length_aspect_ratio'),
 Text(0, 0, 'scatter_ratio'),
 Text(0, 0, 'elongatedness'),
 Text(0, 0, 'pr.axis_rectangularity'),
 Text(0, 0, 'max.length_rectangularity'),
 Text(0, 0, 'scaled_variance'),
 Text(0, 0, 'scaled_variance.1'),
 Text(0, 0, 'scaled_radius_of_gyration'),
 Text(0, 0, 'scaled_radius_of_gyration.1'),
 Text(0, 0, 'skewness_about'),
 Text(0, 0, 'skewness_about.1'),
 Text(0, 0, 'skewness_about.2'),
 Text(0, 0, 'hollows_ratio')]
In [15]:
for col_name in cData.columns[:-1]:
    q1 = cData[col_name].quantile(0.25)
    q3 = cData[col_name].quantile(0.75)
    iqr = q3 - q1
    
    low = q1-1.5*iqr
    high = q3+1.5*iqr
    
    cData.loc[(cData[col_name] < low) | (cData[col_name] > high), col_name] = cData[col_name].median()
    
g = cData.boxplot(figsize=(15,15))
g.set_xticklabels(plt.xticks()[1], rotation=60)
Out[15]:
[Text(0, 0, 'compactness'),
 Text(0, 0, 'circularity'),
 Text(0, 0, 'distance_circularity'),
 Text(0, 0, 'radius_ratio'),
 Text(0, 0, 'pr.axis_aspect_ratio'),
 Text(0, 0, 'max.length_aspect_ratio'),
 Text(0, 0, 'scatter_ratio'),
 Text(0, 0, 'elongatedness'),
 Text(0, 0, 'pr.axis_rectangularity'),
 Text(0, 0, 'max.length_rectangularity'),
 Text(0, 0, 'scaled_variance'),
 Text(0, 0, 'scaled_variance.1'),
 Text(0, 0, 'scaled_radius_of_gyration'),
 Text(0, 0, 'scaled_radius_of_gyration.1'),
 Text(0, 0, 'skewness_about'),
 Text(0, 0, 'skewness_about.1'),
 Text(0, 0, 'skewness_about.2'),
 Text(0, 0, 'hollows_ratio')]
In [16]:
labelencoder_X = LabelEncoder()
cData['class'] = labelencoder_X.fit_transform(cData['class'])
In [17]:
cData.corr()
Out[17]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio class
compactness 1.000000 0.684887 0.789928 0.721925 0.192864 0.499928 0.812620 -0.788750 0.813694 0.676143 0.769871 0.806170 0.585243 -0.246681 0.197308 0.156348 0.298537 0.365552 -0.033796
circularity 0.684887 1.000000 0.792320 0.638280 0.203253 0.560470 0.847938 -0.821472 0.843400 0.961318 0.802768 0.827462 0.925816 0.068745 0.136351 -0.009666 -0.104426 0.046351 -0.158910
distance_circularity 0.789928 0.792320 1.000000 0.794222 0.244332 0.666809 0.905076 -0.911307 0.893025 0.774527 0.869584 0.883943 0.705771 -0.229353 0.099107 0.262345 0.146098 0.332732 -0.064467
radius_ratio 0.721925 0.638280 0.794222 1.000000 0.650554 0.463958 0.769941 -0.825392 0.744139 0.579468 0.786183 0.760257 0.550774 -0.390459 0.035755 0.179601 0.405849 0.491758 -0.213948
pr.axis_aspect_ratio 0.192864 0.203253 0.244332 0.650554 1.000000 0.150295 0.194195 -0.298144 0.163047 0.147592 0.207101 0.196401 0.148591 -0.321070 -0.056030 -0.021088 0.400882 0.415734 -0.209298
max.length_aspect_ratio 0.499928 0.560470 0.666809 0.463958 0.150295 1.000000 0.490759 -0.504181 0.487931 0.642713 0.401391 0.463249 0.397397 -0.335444 0.081898 0.141664 0.083794 0.413174 0.352958
scatter_ratio 0.812620 0.847938 0.905076 0.769941 0.194195 0.490759 1.000000 -0.971601 0.989751 0.809083 0.960883 0.980447 0.799875 0.011314 0.064242 0.211647 0.005628 0.118817 -0.288895
elongatedness -0.788750 -0.821472 -0.911307 -0.825392 -0.298144 -0.504181 -0.971601 1.000000 -0.948996 -0.775854 -0.947644 -0.948851 -0.766314 0.078391 -0.046943 -0.183642 -0.115126 -0.216905 0.339344
pr.axis_rectangularity 0.813694 0.843400 0.893025 0.744139 0.163047 0.487931 0.989751 -0.948996 1.000000 0.810934 0.947329 0.973606 0.796690 0.027545 0.073127 0.213801 -0.018649 0.099286 -0.258481
max.length_rectangularity 0.676143 0.961318 0.774527 0.579468 0.147592 0.642713 0.809083 -0.775854 0.810934 1.000000 0.750222 0.789632 0.866450 0.053856 0.130702 0.004129 -0.103948 0.076770 -0.032399
scaled_variance 0.769871 0.802768 0.869584 0.786183 0.207101 0.401391 0.960883 -0.947644 0.947329 0.750222 1.000000 0.943780 0.785073 0.025828 0.024693 0.197122 0.015171 0.086330 -0.324062
scaled_variance.1 0.806170 0.827462 0.883943 0.760257 0.196401 0.463249 0.980447 -0.948851 0.973606 0.789632 0.943780 1.000000 0.782972 0.009386 0.065731 0.204941 0.017557 0.119642 -0.279487
scaled_radius_of_gyration 0.585243 0.925816 0.705771 0.550774 0.148591 0.397397 0.799875 -0.766314 0.796690 0.866450 0.785073 0.782972 1.000000 0.215279 0.162970 -0.055667 -0.224450 -0.118002 -0.250267
scaled_radius_of_gyration.1 -0.246681 0.068745 -0.229353 -0.390459 -0.321070 -0.335444 0.011314 0.078391 0.027545 0.053856 0.025828 0.009386 0.215279 1.000000 -0.057755 -0.123996 -0.832738 -0.901332 -0.283540
skewness_about 0.197308 0.136351 0.099107 0.035755 -0.056030 0.081898 0.064242 -0.046943 0.073127 0.130702 0.024693 0.065731 0.162970 -0.057755 1.000000 -0.041734 0.086661 0.062619 0.126720
skewness_about.1 0.156348 -0.009666 0.262345 0.179601 -0.021088 0.141664 0.211647 -0.183642 0.213801 0.004129 0.197122 0.204941 -0.055667 -0.123996 -0.041734 1.000000 0.074473 0.200651 -0.010872
skewness_about.2 0.298537 -0.104426 0.146098 0.405849 0.400882 0.083794 0.005628 -0.115126 -0.018649 -0.103948 0.015171 0.017557 -0.224450 -0.832738 0.086661 0.074473 1.000000 0.892581 0.067244
hollows_ratio 0.365552 0.046351 0.332732 0.491758 0.415734 0.413174 0.118817 -0.216905 0.099286 0.076770 0.086330 0.119642 -0.118002 -0.901332 0.062619 0.200651 0.892581 1.000000 0.235874
class -0.033796 -0.158910 -0.064467 -0.213948 -0.209298 0.352958 -0.288895 0.339344 -0.258481 -0.032399 -0.324062 -0.279487 -0.250267 -0.283540 0.126720 -0.010872 0.067244 0.235874 1.000000
In [18]:
def plot_corr(df, size=14, title='Correlation'):
    sns.set(font_scale=1.15)
    fig,ax=plt.subplots(figsize=(size,size))
    g = sns.heatmap(df.corr(),vmin=0.5, annot=True, linewidths=0.01,center=1,linecolor="white",cbar=False,square=True)
    bottom, top = g.get_ylim()
    g.set_ylim(bottom + 0.5, top - 0.5)
    g.set_xticklabels(plt.xticks()[1], rotation=60)
    plt.title(title,fontsize=12)
    ax.tick_params(labelsize=14)
    
plot_corr(cData)

High Positive corelation exist between scatter_ratio, pr.axis_rectangularity, scaled_variance1, distance_circularity, scaled_variance, radius_ratio.

High negative corelation exist with elongatedness.

We can clrealy see that following features have very low corelation, as compared to others: pr.pr.axis_aspect_ratio, max.length_aspect_ratio, scaled_radius_of_gyration.1, skewness_about, skewness_about.1, skewness_about.2, hollows_ratio.

In [19]:
sns.pairplot(cData, hue='class')
C:\ProgramData\Anaconda3\lib\site-packages\statsmodels\nonparametric\kde.py:487: RuntimeWarning: invalid value encountered in true_divide
  binned = fast_linbin(X, a, b, gridsize) / (delta * nobs)
C:\ProgramData\Anaconda3\lib\site-packages\statsmodels\nonparametric\kdetools.py:34: RuntimeWarning: invalid value encountered in double_scalars
  FAC1 = 2*(np.pi*bw/RANGE)**2
Out[19]:
<seaborn.axisgrid.PairGrid at 0x1a220a37208>

It clearly tell us: High Positive corelation exist between scatter_ratio, pr.axis_rectangularity, scaled_variance1, distance_circularity, scaled_variance, radius_ratio.

High negative corelation exist with elongatedness.

We can clrealy see that following features have very low corelation, as compared to others: pr.pr.axis_aspect_ratio, max.length_aspect_ratio, scaled_radius_of_gyration.1, skewness_about, skewness_about.1, skewness_about.2, hollows_ratio.

  1. 'compactness', 'distance_circularity', 'radius_ratio' and 'circularity' has negative linear relation with 'elongatedness'.
  2. 'compactness' has positive relation with 'circularity', 'distance_circularity', 'radius_ratio', 'scatter_ratio', 'pr.axis_rectangularity', 'max.axis_rectangularity', 'scaled_variance1', 'scaled_variance'.
  3. 'circularity' has positive relation with distance_circularity, scatter_ratio, pr.axis_rectangularity, max.axis_rectangularity, scaled_variance, scaled_variance1, scaled_radius_of_gyration.
  4. 'distance_circularity' has positive relation with 'radius_ratio', 'scatter_ratio, 'pr.axis_rectangularity', 'max.axis_rectangularity', 'scaled_variance, 'scaled_variance1, 'scaled_radius_of_gyration'.
  5. 'radius_ratio' has positive relation with 'pr.axis_aspect_ratio', 'scatter_ratio', 'scaled_variance', 'scaled_variance1', 'scaled_radius_of_gyration'.
In [ ]:
 

Split the data

In [20]:
x = cData.iloc[:,0:18]
y = cData.iloc[:,18]


# Split x and y into training and test set in 70:30 ratio
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 10)

Logistic Regression Model

In [21]:
LogisticModel = LogisticRegression()
LogisticModel.fit(x_train, y_train)
LogisticModelPrediction = LogisticModel.predict(x_test)

model_score_Logistic = LogisticModel.score(x_test, y_test)
print(model_score_Logistic)
0.9448818897637795
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:469: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.
  "this warning.", FutureWarning)
In [22]:
print('Accuracy on Training data: ',LogisticModel.score(x_train, y_train))
print('Accuracy on Testing data: ',LogisticModel.score(x_test , y_test))
print('Recall value: ',metrics.recall_score(y_test, LogisticModelPrediction, average='macro'))
print('Precision value: ',metrics.precision_score(y_test, LogisticModelPrediction, average='macro'))
print("Confusion Matrix:\n", metrics.confusion_matrix(y_test, LogisticModelPrediction))
print("classification Report:\n", metrics.classification_report(y_test, LogisticModelPrediction))
Accuracy on Training data:  0.9611486486486487
Accuracy on Testing data:  0.9448818897637795
Recall value:  0.9494453618261293
Precision value:  0.940409458336481
Confusion Matrix:
 [[ 66   4   1]
 [  4 117   4]
 [  0   1  57]]
classification Report:
               precision    recall  f1-score   support

           0       0.94      0.93      0.94        71
           1       0.96      0.94      0.95       125
           2       0.92      0.98      0.95        58

    accuracy                           0.94       254
   macro avg       0.94      0.95      0.94       254
weighted avg       0.95      0.94      0.94       254

In [23]:
resultsDf1 = pd.DataFrame({'Model':['Logistic'],'Accuracy': LogisticModel.score(x_test , y_test)},index={'1'})
resultsDf1 = resultsDf1[['Model','Accuracy']]
resultsDf1
Out[23]:
Model Accuracy
1 Logistic 0.944882

GaussianNB Model

In [24]:
GaussianNBModel = GaussianNB()
GaussianNBModel.fit(x_train, y_train)
GaussianNBModelPrediction = GaussianNBModel.predict(x_test)

model_score_GaussianNB = GaussianNBModel.score(x_test, y_test)
print(model_score_GaussianNB)
0.594488188976378
In [25]:
print('Accuracy on Training data: ',GaussianNBModel.score(x_train, y_train))
print('Accuracy on Testing data: ', GaussianNBModel.score(x_test , y_test))
print('Recall value: ', metrics.recall_score(y_test, GaussianNBModelPrediction, average='macro'))
print('Precision value: ', metrics.precision_score(y_test, GaussianNBModelPrediction, average='macro'))
print("Confusion Matrix:\n", metrics.confusion_matrix(GaussianNBModelPrediction, y_test))
print("Classification Report:\n", metrics.classification_report(GaussianNBModelPrediction, y_test))
Accuracy on Training data:  0.6266891891891891
Accuracy on Testing data:  0.594488188976378
Recall value:  0.608675408774486
Precision value:  0.7405060217560218
Confusion Matrix:
 [[15  0  0]
 [16 79  1]
 [40 46 57]]
Classification Report:
               precision    recall  f1-score   support

           0       0.21      1.00      0.35        15
           1       0.63      0.82      0.71        96
           2       0.98      0.40      0.57       143

    accuracy                           0.59       254
   macro avg       0.61      0.74      0.54       254
weighted avg       0.80      0.59      0.61       254

In [26]:
#Store the accuracy results for each kernel in a dataframe for final comparison
tempResultsDf = pd.DataFrame({'Model':['Naive Bayes'], 'Accuracy': GaussianNBModel.score(x_test, y_test)},index={'2'})
resultsDf1 = pd.concat([resultsDf1, tempResultsDf])
resultsDf1 = resultsDf1[['Model','Accuracy']]
resultsDf1
Out[26]:
Model Accuracy
1 Logistic 0.944882
2 Naive Bayes 0.594488

Train a Support vector machine

In [27]:
svmClassifier = SVC()
svmClassifier = svmClassifier.fit(x_train, y_train)
svmClassifierPrediction = svmClassifier.predict(x_test)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
In [28]:
print('Accuracy on Training data: ',svmClassifier.score(x_train, y_train))
print('Accuracy on Testing data: ', svmClassifier.score(x_test , y_test))
print('Recall value: ',metrics.recall_score(y_test, svmClassifierPrediction, average='macro'))
print('Precision value: ',metrics.precision_score(y_test, svmClassifierPrediction, average='macro'))
print("Confusion Matrix:\n",metrics.confusion_matrix(svmClassifierPrediction,y_test))
print("Classification Report:\n",metrics.classification_report(svmClassifierPrediction,y_test))
Accuracy on Training data:  1.0
Accuracy on Testing data:  0.4921259842519685
Recall value:  0.3333333333333333
Precision value:  0.16404199475065617
Confusion Matrix:
 [[  0   0   0]
 [ 71 125  58]
 [  0   0   0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       1.00      0.49      0.66       254
           2       0.00      0.00      0.00         0

    accuracy                           0.49       254
   macro avg       0.33      0.16      0.22       254
weighted avg       1.00      0.49      0.66       254

C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\classification.py:1437: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\classification.py:1439: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples.
  'recall', 'true', average, warn_for)
In [29]:
tempResultsDf = pd.DataFrame({'Model':['SVM'], 'Accuracy': svmClassifier.score(x_test, y_test)},index={'3'})
resultsDf1 = pd.concat([resultsDf1, tempResultsDf])
resultsDf1 = resultsDf1[['Model','Accuracy']]
resultsDf1
Out[29]:
Model Accuracy
1 Logistic 0.944882
2 Naive Bayes 0.594488
3 SVM 0.492126

Perform K-fold cross validation

In [30]:
#Use the Naive Bayes CLassifier with k fold cross validation
scores = cross_val_score(svmClassifier, cData, y, cv=18)
print(scores)
print('Average score: ', np.mean(scores))
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
[0.48979592 0.5        0.5106383  0.53191489 0.5106383  0.5106383
 0.5106383  0.5106383  0.5106383  0.53191489 0.5106383  0.5106383
 0.5106383  0.5106383  0.5106383  0.5        0.5        0.5       ]
Average score:  0.5094803878998407
In [31]:
#Store the accuracy results for each kernel in a dataframe for final comparison
tempResultsDf = pd.DataFrame({'Model':['SVM k fold'], 'Accuracy': np.mean(scores)}, index={'3'})
resultsDf1 = pd.concat([resultsDf1, tempResultsDf])
resultsDf1 = resultsDf1[['Model','Accuracy']]
resultsDf1
Out[31]:
Model Accuracy
1 Logistic 0.944882
2 Naive Bayes 0.594488
3 SVM 0.492126
3 SVM k fold 0.509480
In [ ]:
 

PCA

In [32]:
XScaled=x.apply(zscore)
XScaled.head()
Out[32]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio
0 0.160580 0.518073 0.057177 0.300945 1.933135 0.912212 -0.207598 0.136262 -0.224342 0.758332 -0.400771 -0.337407 0.285705 -0.315806 -0.032330 0.387162 -0.312012 0.183957
1 -0.325470 -0.623732 0.120741 -0.850666 -0.740596 0.427456 -0.599423 0.520519 -0.610886 -0.344578 -0.594220 -0.618623 -0.513630 0.009122 0.624090 0.161740 0.013265 0.452977
2 1.254193 0.844303 1.519141 1.265808 0.863642 0.912212 1.148719 -1.144597 0.935290 0.689401 1.114582 1.131806 1.392477 0.171586 1.718123 -0.401818 -0.149374 0.049447
3 -0.082445 -0.623732 -0.006386 -0.290423 0.328896 0.427456 -0.750125 0.648605 -0.610886 -0.344578 -0.916635 -0.739145 -1.466683 -1.453054 -0.032330 -0.289106 1.639649 1.529056
4 -1.054545 -0.134387 -0.769150 1.141310 -0.027601 -0.057300 -0.599423 0.520519 -0.610886 -0.275646 1.694930 -0.647319 0.408680 -0.072110 0.624090 -0.176395 -1.450481 -1.699181
In [33]:
plt.rcParams['figure.figsize']=(10,6)
plt.plot(cData)
plt.show()
In [34]:
plt.rcParams['figure.figsize']=(10,6)
plt.plot(XScaled)
plt.show()
In [35]:
# Calculating the covariance between attributes after scaling
covMatrix = np.cov(XScaled.T, rowvar=True)
print('Covariance Matrix \n', covMatrix)
Covariance Matrix 
 [[ 1.00118343  0.68569786  0.79086299  0.72277977  0.1930925   0.50051942
   0.81358214 -0.78968322  0.81465658  0.67694334  0.77078163  0.80712401
   0.58593517 -0.24697246  0.19754181  0.1565327   0.29889034  0.36598446]
 [ 0.68569786  1.00118343  0.79325751  0.63903532  0.20349327  0.5611334
   0.8489411  -0.82244387  0.84439802  0.96245572  0.80371846  0.82844154
   0.92691166  0.06882659  0.13651201 -0.00967793 -0.10455005  0.04640562]
 [ 0.79086299  0.79325751  1.00118343  0.79516215  0.24462154  0.66759792
   0.90614687 -0.9123854   0.89408198  0.77544391  0.87061349  0.88498924
   0.70660663 -0.22962442  0.09922417  0.26265581  0.14627113  0.33312625]
 [ 0.72277977  0.63903532  0.79516215  1.00118343  0.65132393  0.46450748
   0.77085211 -0.82636872  0.74502008  0.58015378  0.78711387  0.76115704
   0.55142559 -0.39092105  0.03579728  0.17981316  0.40632957  0.49234013]
 [ 0.1930925   0.20349327  0.24462154  0.65132393  1.00118343  0.15047265
   0.19442484 -0.29849719  0.16323988  0.14776643  0.20734569  0.19663295
   0.14876723 -0.32144977 -0.05609621 -0.02111342  0.401356    0.41622574]
 [ 0.50051942  0.5611334   0.66759792  0.46450748  0.15047265  1.00118343
   0.49133933 -0.50477756  0.48850876  0.64347365  0.40186618  0.46379685
   0.39786723 -0.33584133  0.08199536  0.14183116  0.08389276  0.41366325]
 [ 0.81358214  0.8489411   0.90614687  0.77085211  0.19442484  0.49133933
   1.00118343 -0.97275069  0.99092181  0.81004084  0.96201996  0.98160681
   0.80082111  0.01132718  0.06431825  0.21189733  0.00563439  0.1189581 ]
 [-0.78968322 -0.82244387 -0.9123854  -0.82636872 -0.29849719 -0.50477756
  -0.97275069  1.00118343 -0.95011894 -0.77677186 -0.94876596 -0.94997386
  -0.76722075  0.07848365 -0.04699819 -0.18385891 -0.11526213 -0.2171615 ]
 [ 0.81465658  0.84439802  0.89408198  0.74502008  0.16323988  0.48850876
   0.99092181 -0.95011894  1.00118343  0.81189327  0.94845027  0.97475823
   0.79763248  0.02757736  0.07321311  0.21405404 -0.01867064  0.09940372]
 [ 0.67694334  0.96245572  0.77544391  0.58015378  0.14776643  0.64347365
   0.81004084 -0.77677186  0.81189327  1.00118343  0.75110957  0.79056684
   0.86747579  0.05391989  0.13085669  0.00413356 -0.10407076  0.07686047]
 [ 0.77078163  0.80371846  0.87061349  0.78711387  0.20734569  0.40186618
   0.96201996 -0.94876596  0.94845027  0.75110957  1.00118343  0.94489677
   0.78600191  0.02585841  0.02472235  0.19735505  0.01518932  0.08643233]
 [ 0.80712401  0.82844154  0.88498924  0.76115704  0.19663295  0.46379685
   0.98160681 -0.94997386  0.97475823  0.79056684  0.94489677  1.00118343
   0.78389866  0.00939688  0.0658085   0.20518392  0.01757781  0.11978365]
 [ 0.58593517  0.92691166  0.70660663  0.55142559  0.14876723  0.39786723
   0.80082111 -0.76722075  0.79763248  0.86747579  0.78600191  0.78389866
   1.00118343  0.21553366  0.16316265 -0.05573322 -0.22471583 -0.11814142]
 [-0.24697246  0.06882659 -0.22962442 -0.39092105 -0.32144977 -0.33584133
   0.01132718  0.07848365  0.02757736  0.05391989  0.02585841  0.00939688
   0.21553366  1.00118343 -0.05782288 -0.12414277 -0.83372383 -0.90239877]
 [ 0.19754181  0.13651201  0.09922417  0.03579728 -0.05609621  0.08199536
   0.06431825 -0.04699819  0.07321311  0.13085669  0.02472235  0.0658085
   0.16316265 -0.05782288  1.00118343 -0.04178316  0.0867631   0.06269293]
 [ 0.1565327  -0.00967793  0.26265581  0.17981316 -0.02111342  0.14183116
   0.21189733 -0.18385891  0.21405404  0.00413356  0.19735505  0.20518392
  -0.05573322 -0.12414277 -0.04178316  1.00118343  0.07456104  0.20088894]
 [ 0.29889034 -0.10455005  0.14627113  0.40632957  0.401356    0.08389276
   0.00563439 -0.11526213 -0.01867064 -0.10407076  0.01518932  0.01757781
  -0.22471583 -0.83372383  0.0867631   0.07456104  1.00118343  0.89363767]
 [ 0.36598446  0.04640562  0.33312625  0.49234013  0.41622574  0.41366325
   0.1189581  -0.2171615   0.09940372  0.07686047  0.08643233  0.11978365
  -0.11814142 -0.90239877  0.06269293  0.20088894  0.89363767  1.00118343]]
In [36]:
eigenvalues, eigenvectors = np.linalg.eig(covMatrix)
print('Eigen Vectors \n%s', eigenvectors)
print('\n Eigen Values \n%s', eigenvalues)
Eigen Vectors 
%s [[-2.72502890e-01 -8.70435783e-02  3.81852075e-02  1.38675013e-01
  -1.37101466e-01  2.63611383e-01  2.02717114e-01 -7.58796410e-01
   3.66685918e-01  1.60045219e-01  8.40252779e-02  2.14645175e-02
  -1.87350749e-02  6.89082276e-02  4.26105276e-02  9.97784975e-02
  -8.22590084e-02 -3.30366937e-02]
 [-2.87254690e-01  1.31621757e-01  2.01146908e-01 -3.80554832e-02
   1.38995553e-01 -7.13474241e-02 -3.92275358e-01 -6.76034223e-02
   5.53261885e-02 -1.82323962e-01 -3.65229874e-02  1.47247511e-01
  -4.89102355e-02  5.90534770e-02 -6.74107885e-01  1.63466948e-01
  -2.59100771e-01  2.48832011e-01]
 [-3.02421105e-01 -4.61430061e-02 -6.34621085e-02  1.08954287e-01
   8.00174278e-02 -1.69006151e-02  1.63371282e-01  2.77371950e-01
   7.46784853e-02  2.73033778e-01  4.68505530e-01  6.52730855e-01
   4.74162132e-03 -1.62108150e-01 -4.99754439e-04 -6.36582307e-02
   1.20629778e-01  9.80561531e-02]
 [-2.69713545e-01 -1.97931263e-01 -5.62851689e-02 -2.54355087e-01
  -1.33744367e-01 -1.38183653e-01  1.61910525e-01  1.10544748e-01
   2.66666666e-01 -5.05987218e-02 -5.45526034e-01  7.52188680e-02
   3.70499547e-03 -3.93288246e-01  1.74861248e-01 -1.33284415e-01
  -1.86241567e-01  3.60765151e-01]
 [-9.78607336e-02 -2.57839952e-01  6.19927464e-02 -6.12765722e-01
  -1.23601456e-01 -5.77828612e-01  9.27633094e-02 -1.86858758e-01
  -3.86296562e-02 -3.43037888e-02  2.65023238e-01 -2.40287269e-02
   8.90928349e-03  1.63771153e-01 -6.31976228e-02  2.14665592e-02
   1.24639367e-01 -1.77647590e-01]
 [-1.95200137e-01 -1.08045626e-01  1.48957820e-01  2.78678159e-01
   6.34893356e-01 -2.89096995e-01  3.98266293e-01 -4.62187969e-02
  -1.37163365e-01  1.77960797e-01 -1.92846020e-01 -2.29741488e-01
   4.09727876e-03  1.36576102e-01 -9.62482815e-02 -6.89934316e-02
   1.40804371e-01  9.99006987e-02]
 [-3.10523932e-01  7.52853487e-02 -1.09042833e-01  5.39294828e-03
  -8.55574543e-02  9.77471088e-02  9.23519412e-02  6.46204209e-02
  -1.31567659e-01 -1.43132644e-01  9.67172431e-02 -1.53118496e-01
   8.55513044e-01  6.48917601e-02 -4.36596954e-02 -1.56585696e-01
  -1.43109720e-01 -5.28457504e-02]
 [ 3.09006904e-01 -1.32299375e-02  9.08526930e-02  6.52148575e-02
   7.90734442e-02 -7.57282937e-02 -1.04070600e-01 -1.92342823e-01
   2.89633509e-01 -7.93831124e-02 -2.29926427e-02  2.33454000e-02
   2.61858734e-01 -4.96273257e-01 -3.08568675e-01 -2.44030327e-01
   5.11966770e-01 -9.49906147e-02]
 [-3.07287000e-01  8.75601978e-02 -1.06070496e-01  3.08991500e-02
  -8.16463820e-02  1.05403228e-01  9.31317767e-02  1.38684573e-02
  -8.95291026e-02 -2.39896699e-01  1.59356923e-01 -2.17636238e-01
  -4.22479708e-01 -1.13664100e-01 -1.63739102e-01 -6.71547392e-01
  -6.75916711e-02 -2.16727165e-01]
 [-2.78154157e-01  1.22154240e-01  2.13684693e-01  4.14674720e-02
   2.51112937e-01 -7.81962142e-02 -3.54564344e-01 -2.15163418e-01
  -1.58231983e-01 -3.82739482e-01 -1.42837015e-01  3.15261003e-01
   2.00493082e-02 -8.66067604e-03  5.08763287e-01 -5.00643538e-02
   1.60926059e-01 -2.00262071e-01]
 [-2.99765086e-01  7.72657535e-02 -1.44599805e-01 -6.40050869e-02
  -1.47471227e-01  1.32912405e-01  6.80546125e-02  1.95678724e-01
   4.27034669e-02  1.66090908e-01 -4.59667614e-01  1.18383161e-01
  -4.15194745e-02  1.35985919e-01 -2.52182911e-01  2.17416166e-01
   3.24139804e-01 -5.53139002e-01]
 [-3.05532374e-01  7.15030171e-02 -1.10343735e-01 -2.19687048e-03
  -1.10100984e-01  1.15398218e-01  9.01194270e-02  3.77948210e-02
  -1.51072666e-01 -2.87457686e-01  2.09345615e-01 -3.31340876e-01
  -1.22365190e-01 -2.42922436e-01  3.94502237e-02  4.48936624e-01
   4.62827872e-01  3.22499534e-01]
 [-2.63237620e-01  2.10582046e-01  2.02870191e-01 -8.55396458e-02
   5.21210685e-03 -6.70573978e-02 -4.55292717e-01  1.46752664e-01
   2.63771332e-01  5.49626527e-01  1.07713508e-01 -3.99260390e-01
   1.66056546e-02 -3.30876118e-02  2.03029913e-01 -1.06621517e-01
   8.55669069e-02  2.40609291e-02]
 [ 4.19359352e-02  5.03621577e-01 -7.38640211e-02 -1.15399624e-01
  -1.38068605e-01 -1.31513077e-01  8.58226790e-02 -3.30394999e-01
  -5.55267166e-01  3.62547303e-01 -1.26596148e-01  1.21942784e-01
   1.27186667e-03 -2.96030848e-01 -5.79407509e-02 -3.08034829e-02
  -5.10909842e-02  8.79644677e-02]
 [-3.60832115e-02 -1.57663214e-02  5.59173987e-01  4.73703309e-01
  -5.66552244e-01 -3.19176094e-01  1.24532179e-01  1.14255395e-01
  -5.99039250e-02 -5.79891873e-02 -3.25785780e-02  2.88590518e-03
  -4.24341185e-04  4.01635562e-03 -8.22261600e-03  2.05544442e-02
  -4.39201991e-03 -3.76172016e-02]
 [-5.87204797e-02 -9.27462386e-02 -6.70680496e-01  4.28426032e-01
  -1.30869817e-01 -4.68404967e-01 -3.02517700e-01 -1.15403870e-01
   5.23845772e-02  1.28995278e-02 -3.62255133e-02 -1.62495314e-02
  -9.40554994e-03  8.00562035e-02  1.12172401e-02 -2.31296836e-03
   1.13702813e-02  4.44850199e-02]
 [-3.80131449e-02 -5.01621218e-01  6.22407145e-02 -2.74095968e-02
  -1.80519293e-01  2.80136438e-01 -2.58250261e-01 -9.46599623e-02
  -3.79168935e-01  1.87848521e-01 -1.38657118e-01  8.24506703e-02
   2.60800892e-02  2.45816461e-01 -7.88567114e-02 -2.81093089e-01
   3.19960307e-01  3.19055407e-01]
 [-8.47399995e-02 -5.07612106e-01  4.17053530e-02  9.60374943e-02
   1.10788067e-01  5.94444089e-02 -1.73269228e-01 -6.49718344e-03
  -2.80340510e-01  1.33402674e-01  8.39926899e-02 -1.29951586e-01
  -4.18109835e-03 -5.18420304e-01 -3.18514877e-02  2.41164948e-01
  -3.10989286e-01 -3.65128378e-01]]

 Eigen Values 
%s [9.74940269e+00 3.35071912e+00 1.19238155e+00 1.13381916e+00
 8.83997312e-01 6.66265745e-01 3.18150910e-01 2.28179142e-01
 1.31018595e-01 7.98619108e-02 7.33979478e-02 6.46162669e-02
 5.16287320e-03 4.01448646e-02 1.98136761e-02 2.27005257e-02
 3.22758478e-02 2.93936408e-02]
In [37]:
# Step 3 (continued): Sort eigenvalues in descending order

# Make a set of (eigenvalue, eigenvector) pairs
eig_pairs = [(eigenvalues[index], eigenvectors[:,index]) for index in range(len(eigenvalues))]

# Sort the (eigenvalue, eigenvector) pairs from highest to lowest with respect to eigenvalue
eig_pairs.sort()

eig_pairs.reverse()
print(eig_pairs)

# Extract the descending ordered eigenvalues and eigenvectors
eigvalues_sorted = [eig_pairs[index][0] for index in range(len(eigenvalues))]
eigvectors_sorted = [eig_pairs[index][1] for index in range(len(eigenvalues))]

# Let's confirm our sorting worked, print out eigenvalues
print('Eigenvalues in descending order: \n%s' %eigvalues_sorted)
[(9.749402689379597, array([-0.27250289, -0.28725469, -0.30242111, -0.26971354, -0.09786073,
       -0.19520014, -0.31052393,  0.3090069 , -0.307287  , -0.27815416,
       -0.29976509, -0.30553237, -0.26323762,  0.04193594, -0.03608321,
       -0.05872048, -0.03801314, -0.08474   ])), (3.350719119412978, array([-0.08704358,  0.13162176, -0.04614301, -0.19793126, -0.25783995,
       -0.10804563,  0.07528535, -0.01322994,  0.0875602 ,  0.12215424,
        0.07726575,  0.07150302,  0.21058205,  0.50362158, -0.01576632,
       -0.09274624, -0.50162122, -0.50761211])), (1.1923815452731639, array([ 0.03818521,  0.20114691, -0.06346211, -0.05628517,  0.06199275,
        0.14895782, -0.10904283,  0.09085269, -0.1060705 ,  0.21368469,
       -0.1445998 , -0.11034374,  0.20287019, -0.07386402,  0.55917399,
       -0.6706805 ,  0.06224071,  0.04170535])), (1.1338191632147838, array([ 0.13867501, -0.03805548,  0.10895429, -0.25435509, -0.61276572,
        0.27867816,  0.00539295,  0.06521486,  0.03089915,  0.04146747,
       -0.06400509, -0.00219687, -0.08553965, -0.11539962,  0.47370331,
        0.42842603, -0.0274096 ,  0.09603749])), (0.8839973120036095, array([-0.13710147,  0.13899555,  0.08001743, -0.13374437, -0.12360146,
        0.63489336, -0.08555745,  0.07907344, -0.08164638,  0.25111294,
       -0.14747123, -0.11010098,  0.00521211, -0.1380686 , -0.56655224,
       -0.13086982, -0.18051929,  0.11078807])), (0.6662657454310769, array([ 0.26361138, -0.07134742, -0.01690062, -0.13818365, -0.57782861,
       -0.289097  ,  0.09774711, -0.07572829,  0.10540323, -0.07819621,
        0.1329124 ,  0.11539822, -0.0670574 , -0.13151308, -0.31917609,
       -0.46840497,  0.28013644,  0.05944441])), (0.31815090958438486, array([ 0.20271711, -0.39227536,  0.16337128,  0.16191053,  0.09276331,
        0.39826629,  0.09235194, -0.1040706 ,  0.09313178, -0.35456434,
        0.06805461,  0.09011943, -0.45529272,  0.08582268,  0.12453218,
       -0.3025177 , -0.25825026, -0.17326923])), (0.2281791421155407, array([-0.75879641, -0.06760342,  0.27737195,  0.11054475, -0.18685876,
       -0.0462188 ,  0.06462042, -0.19234282,  0.01386846, -0.21516342,
        0.19567872,  0.03779482,  0.14675266, -0.330395  ,  0.1142554 ,
       -0.11540387, -0.09465996, -0.00649718])), (0.13101859512585473, array([ 0.36668592,  0.05532619,  0.07467849,  0.26666667, -0.03862966,
       -0.13716337, -0.13156766,  0.28963351, -0.0895291 , -0.15823198,
        0.04270347, -0.15107267,  0.26377133, -0.55526717, -0.05990393,
        0.05238458, -0.37916894, -0.28034051])), (0.07986191082036508, array([ 0.16004522, -0.18232396,  0.27303378, -0.05059872, -0.03430379,
        0.1779608 , -0.14313264, -0.07938311, -0.2398967 , -0.38273948,
        0.16609091, -0.28745769,  0.54962653,  0.3625473 , -0.05798919,
        0.01289953,  0.18784852,  0.13340267])), (0.07339794782509106, array([ 0.08402528, -0.03652299,  0.46850553, -0.54552603,  0.26502324,
       -0.19284602,  0.09671724, -0.02299264,  0.15935692, -0.14283702,
       -0.45966761,  0.20934562,  0.10771351, -0.12659615, -0.03257858,
       -0.03622551, -0.13865712,  0.08399269])), (0.06461626687535525, array([ 0.02146452,  0.14724751,  0.65273085,  0.07521887, -0.02402873,
       -0.22974149, -0.1531185 ,  0.0233454 , -0.21763624,  0.315261  ,
        0.11838316, -0.33134088, -0.39926039,  0.12194278,  0.00288591,
       -0.01624953,  0.08245067, -0.12995159])), (0.04014486457709953, array([ 0.06890823,  0.05905348, -0.16210815, -0.39328825,  0.16377115,
        0.1365761 ,  0.06489176, -0.49627326, -0.1136641 , -0.00866068,
        0.13598592, -0.24292244, -0.03308761, -0.29603085,  0.00401636,
        0.0800562 ,  0.24581646, -0.5184203 ])), (0.032275847766898305, array([-0.08225901, -0.25910077,  0.12062978, -0.18624157,  0.12463937,
        0.14080437, -0.14310972,  0.51196677, -0.06759167,  0.16092606,
        0.3241398 ,  0.46282787,  0.08556691, -0.05109098, -0.00439202,
        0.01137028,  0.31996031, -0.31098929])), (0.02939364075031221, array([-0.03303669,  0.24883201,  0.09805615,  0.36076515, -0.17764759,
        0.0999007 , -0.05284575, -0.09499061, -0.21672717, -0.20026207,
       -0.553139  ,  0.32249953,  0.02406093,  0.08796447, -0.0376172 ,
        0.04448502,  0.31905541, -0.36512838])), (0.022700525706219703, array([ 0.0997785 ,  0.16346695, -0.06365823, -0.13328441,  0.02146656,
       -0.06899343, -0.1565857 , -0.24403033, -0.67154739, -0.05006435,
        0.21741617,  0.44893662, -0.10662152, -0.03080348,  0.02055444,
       -0.00231297, -0.28109309,  0.24116495])), (0.019813676080863922, array([ 4.26105276e-02, -6.74107885e-01, -4.99754439e-04,  1.74861248e-01,
       -6.31976228e-02, -9.62482815e-02, -4.36596954e-02, -3.08568675e-01,
       -1.63739102e-01,  5.08763287e-01, -2.52182911e-01,  3.94502237e-02,
        2.03029913e-01, -5.79407509e-02, -8.22261600e-03,  1.12172401e-02,
       -7.88567114e-02, -3.18514877e-02])), (0.0051628732047457404, array([-1.87350749e-02, -4.89102355e-02,  4.74162132e-03,  3.70499547e-03,
        8.90928349e-03,  4.09727876e-03,  8.55513044e-01,  2.61858734e-01,
       -4.22479708e-01,  2.00493082e-02, -4.15194745e-02, -1.22365190e-01,
        1.66056546e-02,  1.27186667e-03, -4.24341185e-04, -9.40554994e-03,
        2.60800892e-02, -4.18109835e-03]))]
Eigenvalues in descending order: 
[9.749402689379597, 3.350719119412978, 1.1923815452731639, 1.1338191632147838, 0.8839973120036095, 0.6662657454310769, 0.31815090958438486, 0.2281791421155407, 0.13101859512585473, 0.07986191082036508, 0.07339794782509106, 0.06461626687535525, 0.04014486457709953, 0.032275847766898305, 0.02939364075031221, 0.022700525706219703, 0.019813676080863922, 0.0051628732047457404]
In [38]:
tot = sum(eigenvalues)
var_explained = [( i /tot ) * 100 for i in sorted(eigenvalues, reverse=True)]
cum_var_exp = np.cumsum(var_explained)
print("Cumulative Variance Explained", cum_var_exp)
plt.plot(var_explained)
Cumulative Variance Explained [ 54.0993254   72.69242795  79.30893968  85.60048941  90.50578051
  94.2028816   95.96829741  97.23446089  97.96148159  98.40463444
  98.81191882  99.17047375  99.39323715  99.57233547  99.73544045
  99.86140541  99.97135127 100.        ]
Out[38]:
[<matplotlib.lines.Line2D at 0x1a236c14708>]
In [39]:
# Ploting 
plt.figure(figsize=(8 , 7))
plt.bar(range(1, eigenvalues.size + 1), var_explained, alpha = 0.5, align = 'center', label = 'Individual explained variance')
plt.step(range(1, eigenvalues.size + 1), cum_var_exp, where='mid', label = 'Cumulative explained variance')
plt.ylabel('Explained Variance Ratio')
plt.xlabel('Principal Components')
plt.legend(loc = 'best')
plt.tight_layout()
plt.show()

First 10 components catures arround 98.5% data, so we can eliminate last 7 components.

In [40]:
# Reducing from 17 to 10 dimension space
pca = PCA(n_components=10)
data_reduced = pca.fit_transform(XScaled)
data_reduced.transpose()
Out[40]:
array([[ 0.58422804, -1.5121798 ,  3.91344816, ...,  5.12009307,
        -3.29709502, -4.96759448],
       [-0.67567325, -0.34893367,  0.2345073 , ..., -0.18227007,
        -1.10194286,  0.42274968],
       [-0.45333356, -0.33343619, -1.26509352, ..., -0.50836783,
         1.93384417,  1.30871531],
       ...,
       [-0.68196902,  0.10442512,  0.17305277, ..., -0.38820845,
         0.45880709, -0.21433678],
       [ 0.31266966, -0.29625823,  0.19108534, ..., -0.07735512,
         0.82142229,  0.59676772],
       [ 0.14411602, -0.39097765, -0.52948668, ...,  0.55527162,
        -0.34059305,  0.10856429]])
In [41]:
pca.components_
Out[41]:
array([[ 0.27250289,  0.28725469,  0.30242111,  0.26971354,  0.09786073,
         0.19520014,  0.31052393, -0.3090069 ,  0.307287  ,  0.27815416,
         0.29976509,  0.30553237,  0.26323762, -0.04193594,  0.03608321,
         0.05872048,  0.03801314,  0.08474   ],
       [-0.08704358,  0.13162176, -0.04614301, -0.19793126, -0.25783995,
        -0.10804563,  0.07528535, -0.01322994,  0.0875602 ,  0.12215424,
         0.07726575,  0.07150302,  0.21058205,  0.50362158, -0.01576632,
        -0.09274624, -0.50162122, -0.50761211],
       [-0.03818521, -0.20114691,  0.06346211,  0.05628517, -0.06199275,
        -0.14895782,  0.10904283, -0.09085269,  0.1060705 , -0.21368469,
         0.1445998 ,  0.11034374, -0.20287019,  0.07386402, -0.55917399,
         0.6706805 , -0.06224071, -0.04170535],
       [ 0.13867501, -0.03805548,  0.10895429, -0.25435509, -0.61276572,
         0.27867816,  0.00539295,  0.06521486,  0.03089915,  0.04146747,
        -0.06400509, -0.00219687, -0.08553965, -0.11539962,  0.47370331,
         0.42842603, -0.0274096 ,  0.09603749],
       [ 0.13710147, -0.13899555, -0.08001743,  0.13374437,  0.12360146,
        -0.63489336,  0.08555745, -0.07907344,  0.08164638, -0.25111294,
         0.14747123,  0.11010098, -0.00521211,  0.1380686 ,  0.56655224,
         0.13086982,  0.18051929, -0.11078807],
       [ 0.26361138, -0.07134742, -0.01690062, -0.13818365, -0.57782861,
        -0.289097  ,  0.09774711, -0.07572829,  0.10540323, -0.07819621,
         0.1329124 ,  0.11539822, -0.0670574 , -0.13151308, -0.31917609,
        -0.46840497,  0.28013644,  0.05944441],
       [ 0.20271711, -0.39227536,  0.16337128,  0.16191053,  0.09276331,
         0.39826629,  0.09235194, -0.1040706 ,  0.09313178, -0.35456434,
         0.06805461,  0.09011943, -0.45529272,  0.08582268,  0.12453218,
        -0.3025177 , -0.25825026, -0.17326923],
       [-0.75879641, -0.06760342,  0.27737195,  0.11054475, -0.18685876,
        -0.0462188 ,  0.06462042, -0.19234282,  0.01386846, -0.21516342,
         0.19567872,  0.03779482,  0.14675266, -0.330395  ,  0.1142554 ,
        -0.11540387, -0.09465996, -0.00649718],
       [ 0.36668592,  0.05532619,  0.07467849,  0.26666667, -0.03862966,
        -0.13716337, -0.13156766,  0.28963351, -0.0895291 , -0.15823198,
         0.04270347, -0.15107267,  0.26377133, -0.55526717, -0.05990393,
         0.05238458, -0.37916894, -0.28034051],
       [-0.16004522,  0.18232396, -0.27303378,  0.05059872,  0.03430379,
        -0.1779608 ,  0.14313264,  0.07938311,  0.2398967 ,  0.38273948,
        -0.16609091,  0.28745769, -0.54962653, -0.3625473 ,  0.05798919,
        -0.01289953, -0.18784852, -0.13340267]])
In [42]:
X_comp = pd.DataFrame(pca.components_,columns=list(XScaled))
X_comp.head()
Out[42]:
compactness circularity distance_circularity radius_ratio pr.axis_aspect_ratio max.length_aspect_ratio scatter_ratio elongatedness pr.axis_rectangularity max.length_rectangularity scaled_variance scaled_variance.1 scaled_radius_of_gyration scaled_radius_of_gyration.1 skewness_about skewness_about.1 skewness_about.2 hollows_ratio
0 0.272503 0.287255 0.302421 0.269714 0.097861 0.195200 0.310524 -0.309007 0.307287 0.278154 0.299765 0.305532 0.263238 -0.041936 0.036083 0.058720 0.038013 0.084740
1 -0.087044 0.131622 -0.046143 -0.197931 -0.257840 -0.108046 0.075285 -0.013230 0.087560 0.122154 0.077266 0.071503 0.210582 0.503622 -0.015766 -0.092746 -0.501621 -0.507612
2 -0.038185 -0.201147 0.063462 0.056285 -0.061993 -0.148958 0.109043 -0.090853 0.106070 -0.213685 0.144600 0.110344 -0.202870 0.073864 -0.559174 0.670680 -0.062241 -0.041705
3 0.138675 -0.038055 0.108954 -0.254355 -0.612766 0.278678 0.005393 0.065215 0.030899 0.041467 -0.064005 -0.002197 -0.085540 -0.115400 0.473703 0.428426 -0.027410 0.096037
4 0.137101 -0.138996 -0.080017 0.133744 0.123601 -0.634893 0.085557 -0.079073 0.081646 -0.251113 0.147471 0.110101 -0.005212 0.138069 0.566552 0.130870 0.180519 -0.110788

P_reduce represents reduced mathematical space. Reducing from 17 to 10 dimension space :

In [43]:
P_reduce = np.array(eigenvectors[0:10])   

# projecting original data into principal component dimensions
X_std_10D = np.dot(XScaled,P_reduce.T)   

# converting array to dataframe for pairplot
Proj_data_df = pd.DataFrame(X_std_10D)

sns.pairplot(Proj_data_df, diag_kind='kde')
Out[43]:
<seaborn.axisgrid.PairGrid at 0x1a236aee048>

Maximum independet attributes have no/very low corelation. Some still have corelation and we have them here even after dimentionality reduction.

Build Models again

Splitting the data

In [44]:
# Split X and y into training and test set in 70:30 ratio
x_train1, x_test1, y_train1, y_test1 = train_test_split(Proj_data_df,y, test_size = 0.3, random_state = 10)

Logistic Regression Model

In [45]:
LogisticModel1 = LogisticRegression()
LogisticModel1.fit(x_train1, y_train1)
LogisticModelPrediction1 = LogisticModel1.predict(x_test1)

model_score_Logistic1 = LogisticModel1.score(x_test1, y_test1)
print(model_score_Logistic1)
0.8503937007874016
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:432: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.
  FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\logistic.py:469: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.
  "this warning.", FutureWarning)
In [46]:
print('Accuracy on Training data: ',LogisticModel1.score(x_train1, y_train1))
print('Accuracy on Testing data: ',LogisticModel1.score(x_test1 , y_test1))
print('Recall value: ',metrics.recall_score(y_test1, LogisticModelPrediction1, average='macro'))
print('Precision value: ',metrics.precision_score(y_test1, LogisticModelPrediction1, average='macro'))
print("Confusion Matrix:\n", metrics.confusion_matrix(y_test1, LogisticModelPrediction1))
print("classification Report:\n", metrics.classification_report(y_test1, LogisticModelPrediction1))
Accuracy on Training data:  0.8597972972972973
Accuracy on Testing data:  0.8503937007874016
Recall value:  0.853741298364902
Precision value:  0.8394342435655995
Confusion Matrix:
 [[ 61   7   3]
 [  9 105  11]
 [  2   6  50]]
classification Report:
               precision    recall  f1-score   support

           0       0.85      0.86      0.85        71
           1       0.89      0.84      0.86       125
           2       0.78      0.86      0.82        58

    accuracy                           0.85       254
   macro avg       0.84      0.85      0.85       254
weighted avg       0.85      0.85      0.85       254

In [47]:
resultsDf2=pd.DataFrame({'Model':['Logistic'],'Accuracy': LogisticModel1.score(x_test1 , y_test1)},index={'1'})
resultsDf2=resultsDf2[['Model','Accuracy']]
resultsDf2
Out[47]:
Model Accuracy
1 Logistic 0.850394

GaussianNB Model

In [48]:
GaussianNBModel1 = GaussianNB()
GaussianNBModel1.fit(x_train1, y_train1)
GaussianNBModelPrediction1 = GaussianNBModel1.predict(x_test1)

model_score_GaussianNB1 = GaussianNBModel1.score(x_test1, y_test1)
print(model_score_GaussianNB1)
0.6417322834645669
In [49]:
print('Accuracy on Training data: ',GaussianNBModel1.score(x_train1, y_train1))
print('Accuracy on Testing data: ', GaussianNBModel1.score(x_test1, y_test1))
print('Recall value: ', metrics.recall_score(y_test1, GaussianNBModelPrediction1, average='macro'))
print('Precision value: ', metrics.precision_score(y_test1, GaussianNBModelPrediction1, average='macro'))
print("Confusion Matrix:\n", metrics.confusion_matrix(GaussianNBModelPrediction1, y_test1))
print("Classification Report:\n", metrics.classification_report(GaussianNBModelPrediction1, y_test1))
Accuracy on Training data:  0.6469594594594594
Accuracy on Testing data:  0.6417322834645669
Recall value:  0.6618565646754088
Precision value:  0.7433265993265993
Confusion Matrix:
 [[30  0  0]
 [16 79  4]
 [25 46 54]]
Classification Report:
               precision    recall  f1-score   support

           0       0.42      1.00      0.59        30
           1       0.63      0.80      0.71        99
           2       0.93      0.43      0.59       125

    accuracy                           0.64       254
   macro avg       0.66      0.74      0.63       254
weighted avg       0.75      0.64      0.64       254

In [50]:
#Store the accuracy results for each kernel in a dataframe for final comparison
tempResultsDf = pd.DataFrame({'Model':['Naive Bayes'], 'Accuracy': GaussianNBModel1.score(x_test1, y_test1)},index={'2'})
resultsDf2 = pd.concat([resultsDf2, tempResultsDf])
resultsDf2 = resultsDf2[['Model','Accuracy']]
resultsDf2
Out[50]:
Model Accuracy
1 Logistic 0.850394
2 Naive Bayes 0.641732

Train a Support vector machine

In [51]:
svmClassifier1 = SVC()
svmClassifier1 = svmClassifier1.fit(x_train1, y_train1)
svmClassifierPrediction1 = svmClassifier1.predict(x_test1)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
In [52]:
print('Accuracy on Training data: ',svmClassifier1.score(x_train1, y_train1))
print('Accuracy on Testing data: ', svmClassifier1.score(x_test1 , y_test1))
print('Recall value: ',metrics.recall_score(y_test1, svmClassifierPrediction1, average='macro'))
print('Precision value: ',metrics.precision_score(y_test1, svmClassifierPrediction1, average='macro'))
print("Confusion Matrix:\n",metrics.confusion_matrix(svmClassifierPrediction1,y_test1))
print("Classification Report:\n",metrics.classification_report(svmClassifierPrediction1,y_test1))
Accuracy on Training data:  0.956081081081081
Accuracy on Testing data:  0.937007874015748
Recall value:  0.934870649182451
Precision value:  0.9311268479985285
Confusion Matrix:
 [[ 66   2   1]
 [  3 118   3]
 [  2   5  54]]
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.96      0.94        69
           1       0.94      0.95      0.95       124
           2       0.93      0.89      0.91        61

    accuracy                           0.94       254
   macro avg       0.93      0.93      0.93       254
weighted avg       0.94      0.94      0.94       254

In [53]:
tempResultsDf = pd.DataFrame({'Model':['SVM'], 'Accuracy': svmClassifier1.score(x_test1, y_test1)},index={'3'})
resultsDf2 = pd.concat([resultsDf2, tempResultsDf])
resultsDf2 = resultsDf2[['Model','Accuracy']]
resultsDf2
Out[53]:
Model Accuracy
1 Logistic 0.850394
2 Naive Bayes 0.641732
3 SVM 0.937008

Perform K-fold cross validation

In [54]:
#Use the Naive Bayes CLassifier with k fold cross validation
scores = cross_val_score(svmClassifier1, Proj_data_df, y, cv=18)
print(scores)
print('Average score: ', np.mean(scores))
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
[0.89795918 0.89583333 0.93617021 1.         0.89361702 1.
 0.93617021 0.91489362 0.93617021 1.         0.95744681 0.91489362
 0.85106383 0.9787234  0.91489362 0.97826087 0.95652174 0.95652174]
Average score:  0.9399521898902433
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
In [55]:
#Store the accuracy results for each kernel in a dataframe for final comparison
tempResultsDf = pd.DataFrame({'Model':['SVM k fold'], 'Accuracy': np.mean(scores)}, index={'3'})
resultsDf2 = pd.concat([resultsDf2, tempResultsDf])
resultsDf2 = resultsDf2[['Model','Accuracy']]
In [56]:
print("Before PCA:")
resultsDf1
Before PCA:
Out[56]:
Model Accuracy
1 Logistic 0.944882
2 Naive Bayes 0.594488
3 SVM 0.492126
3 SVM k fold 0.509480
In [57]:
print("After PCA:")
resultsDf2
After PCA:
Out[57]:
Model Accuracy
1 Logistic 0.850394
2 Naive Bayes 0.641732
3 SVM 0.937008
3 SVM k fold 0.939952

We can conclude that:

  1. Before reducing attributes: Logistic Regression performed better
  2. After reducing attributes from 18 to 10
  3. SVM and SVM k fold performed bertter.
In [ ]:
 
In [ ]:
 
In [ ]: